Into That Dark Night

home *** CD-ROM | disk | FTP | other *** search

/ Into That Dark Night / Into That Dark Night.iso / pc / YadVashem / Database / WorkDir / ICONtml.PL < prev next >

Wrap

Text File | 1999-01-20 | 16.8 KB | 457 lines

# Parsing The Timeline files of and finding keywords etc. # Written: Eli Zvuluny - # Possible Worlds # # First version - 17/1/99 # # # The lex values is taken from the file lexItems.txt generated by access. # require 'StandardFile.pl'; # First start to process all files in the current folder for an existance of # keywords (i.e. for a lexicon entries) use Cwd; use English; $keyFile = "lexItems.txt"; # file name generated by access, in the directory "workDir" open (ERRFILE, ">keyErr.lex"); print "==== Processing the keywords (lexItems.txt) file =====\n"; open (KEYFILE, "<$keyFile") or die "can't open file $keyFile\n"; while (<KEYFILE>) { $current_line = $_; chomp; ($tmpNum, $tmpKey) = $current_line =~ /^(\d+),(.+)$/; $tmpKey =~ tr/a-z/A-Z/; # in case of multi words, extract extra blanks @tmpArr = split(/\s+/,$tmpKey); $tmpKey = join(" ",@tmpArr); # print "Before Keys $tmpKey\n"; ($tmpKey, $parenKey) = $tmpKey =~ /^([^]+[^\s])\s*($[^\($].*\))*\s*$/; # print "Keys $tmpKey, $parenKey\n"; chop($tmpKey) if $tmpKey =~ /[\,\.\:]$/; # extract possible extra characters if ($lexItems{$tmpKey}) { print "*** The lexicon item: [$tmpKey] already exists\n"; print ERRFILE "*** The lexicon item: [$tmpKey] already exists\n"; } else { $lexItems{$tmpKey} = $tmpNum; $lexNums{$tmpNum} = $tmpKey; # and the inverted list } if ($parenKey) { $parenKey =~ s/$(.*)$/\1/; chop($parenKey) if $parenKey =~ /[\,\.\:]$/; # extract possible extra characters if ($lexItems{$parenKey}) { print "*** The lexicon item: [$parenKey] already exists\n"; print ERRFILE "*** The lexicon item: [$parenKey] already exists\n"; } else { $lexItems{$parenKey} = $tmpNum; $lexNums{$tmpNum} = $tmpKey; # and the inverted list } } } # while KEYFILE close (KEYFILE); foreach $key (sort (keys (%lexItems))) { print "$key $lexItems{$key}\n"; # check if there is a paired keywords separated with a "," if ($key =~ /^([^\,]+)\s*\,\s*([^\,]+)(\s*$.*$)*$/) { # print "paired - $1 - $2 $key, $lexItems{$key}\n"; $pairedItems{$1} = $2; $pairedItems{$2} = $1; $pairLexKey{$1 . " " . $2} = $lexItems{$key}; $pairLexKey{$2 . " " . $1} = $lexItems{$key}; # print "paired - $1 - $2 $key, $lexItems{$key}\n"; if (!$lexItems{$1}) { # the If enforce the existance of a normal entry which = first part of pair $lexItems{$1} = $lexItems{$key}; # this will be the last choice for a pair element. # $lexNums{$idx-1} = $1; } $firstPaired{$lexItems{$key}} = $1; # used for a case of multipled paired entries (such as # (Kaplan, Moshe - Kaplan, Shimon), in those cases, on the second entry (Shimon), when # there is a simple reference to Kaplan (a lot), the system mark it by mistake since # it assumes that it refers to the first Kaplan, and not to the titled keyword. } else { #if it is a multiword, save the first word, and max number of words for that item. @tmpArr = split(/\s+/,$key); if ($#tmpArr > 1) { $tmpVal = $firstWord{$tmpArr[0]}; if ($#tmpArr > $tmpVal) { $firstWord{$tmpArr[0]} = $#tmpArr+1; } # print " Multiword $Key, $tmpArr[0], $firstWord{$tmpArr[0]}\n"; } } #if - elsif if (($beforeParen, $inParen) = $key =~ /^\s*([^\s]+)\s*$(.+)\s*$$/) { chop($beforeParen) if $beforeParen =~ /[\,\.\:]$/; # extract possible extra characters chop($inParen) if $inParen =~ /[\,\.\:]$/; # extract possible extra characters if (!$lexItems{$inParen}) { $lexItems{$inParen} = $lexItems{$key}; } if (!$lexItems{$beforeParen}) { $lexItems{$beforeParen} = $lexItems{$key}; } } # there is another keyword in Parentheses } #foreach $dirname = "lexFiles"; $currentDir = cwd; # print "$currentDir \n"; # printKeyArr (\%pairLexKey); # now get the base dir, which is the parent directory, and the directory separator ($dirSep,$lastDir) = $currentDir =~ /([\\\:])([^\\\:]+)$/; # $baseDir = $`; $baseDir = $PREMATCH; @dirVec = ("TLtext"); foreach $theDir (@dirVec) { $dirname = "$baseDir$dirSep" . "Text" . "$dirSep$theDir"; mkdir ("${dirname}1", umask()); opendir(CURDIR, $dirname) or die "can't open $dirname\n"; chdir($dirname); $lastKey = ""; $oneLinerName = "oneLiner.txt"; open (ONELINER, ">${dirname}1$dirSep$oneLinerName"); print "==== Start processing the Timeline files in $theDir =====\n"; while ($curFile = readdir(CURDIR)) { if ($curFile =~ /tl(\d\d\d\d\d\d).doc/i) { $evDate = $1; print "-----$curFile----\n"; $veryBigLine = ""; open (FILEIN, "<$curFile") or die "can't open file\n"; open (TXTFILE, ">${dirname}1$dirSep$curFile"); $oneLineMode = 1; $firstPartLine = ""; while (<FILEIN>) { $current_line = $_; if ($oneLineMode) { chomp $current_line; if ($current_line =~ /\@\@\@/) { $oneLineMode = 0; print ONELINER "$evDate\|$firstPartLine\n"; } else { $firstPartLine .= $current_line; } } else { $veryBigLine = $veryBigLine . $current_line; } #if onlinemode } # while # extract key number from file name ($fileIdx) = $curFile =~ /txt(\d+)\.doc/i; # set the value of lastKey, which is the key associated with the file. $lastKey = $lexNums{$fileIdx}; # print "Last key is $lastKey\n"; if (!$lastKey) { $lastKey = "stamstamstam"; } &closeKeyItem($lastKey); # handle last keyword close (FILEIN); } #if } # while } # foreach @dirVec close (KEYFILE); close (ERRFILE); close (ONELINER); print "==== Successful processing =====\n"; sub closeKeyItem { local($curKey) = @_; if ($curKey eq "") { # print "Nothing to do now (really) !!!!!\n"; } else { $veryBigLine =~ s/\n/ # /g; # now replace all the Bold and Italic with @b and @i. while (($HtmlTag,$innerString) = $veryBigLine =~ /<([IB])>([^[<>]*)<\/\1>/i) { # found, replace all the internal words with @i@ or @b@ $beforeStr = $PREMATCH; $afterStr = $POSTMATCH; @markedWords = split(/\s+/,$innerString); # print " $HtmlTag,$innerString Number is $#markedWords\n"; if ($#markedWords >= 0) { for ($i = 0; $i <= $#markedWords; $i++) { if ($markedWords [$i] !~ /^\s*$/) { # print "$markedWords[$i]\n"; if ($markedWords [$i] ne "#") { $markedWords [$i] = '@' . $HtmlTag . '@' . $markedWords [$i]; } } } # for } # if there is something between tag and its closure $veryBigLine = $beforeStr . " " . join(" ",@markedWords) . " " . $afterStr; } # while &handleLines($lastKey, $veryBigLine); $veryBigLine = join (" ",@splitWords); $veryBigLine =~ s/\s*\007\s*/\007/g; $veryBigLine =~ s/\007/\n/g; print TXTFILE $veryBigLine; $veryBigLine = ""; close (TXTFILE); } } # closeKeyItem sub handleLines { local($curKey, $theLine) = @_; my $i, $tmpWord; # first replace the new line with another symbol $theLine =~ s/\n/\007/g; @splitWords = split(/\s+/,$theLine); # The algorithm for finding if a keyword already exist: # First of all naturally translate all words into uppercase characters. # The priorities for a keyword is: # 1. Key word that contains couple of words. # 2. Key words that compose of 2 parts weparated by ",". <part1>,<part2> # The possible combinations are: <part1> <part2> | <part2> <part1> | <part1> # 3. Single word that did not fulfill any of the previous criterias # # on All cases, we may try to use the canonic version of the word. # in all cases we do not look for the current file keyword. $prevWords = ""; $numOfWords = 0; # print "split words $#splitWords\n"; for ($i = 0; $i <= $#splitWords; $i++) { $tmpWord = $splitWords[$i]; $tmpWord =~ tr/a-z/A-Z/; $tmpWord =~ s/^\@[IB]\@(.*)$/\1/i; # if ($tmpWord =~ /^<[\/]?b>/i) { # print "$tmpWord\n"; # } # $tmpWord =~ s/^<[\/]?b>(.+)<[\/]?b>$/\1/i; # as said let's first check if the keyword is part of a multiword item # fast seach for firstword existance: if ($keyLength = $firstWord{$tmpWord}) { if (&checkManyWords($keyLength, $i)) { # print "Succesful match of multi words\n"; $i += $keyLength-1; # skip all matched words ($i was already incremented by 1) } } # if there is a chance for a multiword key. elsif ($keyLength = &checkPairedWords($i)) { # now look of a paired keywords # print "Succesful match of paired words\n"; $i += $keyLength-1; # skip all matched words } elsif ($keyLength = &checkOne($i)) { # print "Succesful match of Single words\n"; # no need to skip all matched words } else { &checkPluralEtc($i); } # $scanKey = ($prevWords ne "") ? $prevWords . " " . $tmpWord : $tmpWord; # if ($lexItems{$scanKey} && $scanKey ne $curKey) { # print " Word found: $scanKey, in text -- $curKey\n"; # @keyWord = split(/\s+/,$scanKey); # if ($#keyWord > 1) { # print " good results found it\n"; # $prevWords = ""; # $numOfWords = 0; # } # keywords has more than 1 word # } #found exactly as appreas in master keywords # elsif ($firstWord{$tmpWord} < 2) { # $prevWords = ""; # $numOfWords = 0; # } # else { # $prevWords = $prevWords . " " . $tmpWord; # $numOfWords++; # print " a long prev word found --- $prevWords -- $numOfWords\n"; # # } # } # for } sub checkManyWords { local ($numOfWords, $curIndex) = @_; my $tmpIdx, $tmpStr; my $i; $tmpIdx = $curIndex + $numOfWords; $tmpStr = $splitWords[$curIndex]; for ($i = 1; $i < $numOfWords; $i++) { # concatenate the next $numOfWords into one keyword $tmpStr = $tmpStr . " " . $splitWords[$curIndex+$i]; } # for $tmpStr =~ tr/a-z/A-Z/; $tmpStr =~ s/\@[IB]\@([^\@]*)/\1/ig; chop($tmpStr) if $tmpStr =~ /[\,\.\:]$/; chop($tmpStr) if $tmpStr =~ /\)$/; if ($tmpStr =~ /^\(/) { $tmpStr = substr ($tmpStr, 1); } if ($tmpStr =~ /^(.*)\'s$/i) { $tmpStr = $1; } $theIndex = $lexItems{$tmpStr}; # print " Value of index $lexItems{$tmpStr}\n" if $theIndex; if ($theIndex && (($theDir ne "Lexicon") || ($theIndex != $fileIdx))) { # for ($i = 0; $i < $numOfWords; $i++) { # $splitWords[$curIndex+$i] =~ s/^/\@$theIndex\@/; # print "$splitWords[$curIndex+$i] "; # This was the old method # $splitWords[$curIndex] =~ s/^/\<$theIndex\>/; # $splitWords[$curIndex + $numOfWords-1] =~ s/$/\<\/$theIndex\>/; # print "Added tag in checkManyWords - $splitWords[$curIndex]\n"; for ($i = $curIndex; $i < $curIndex+$numOfWords ; $i++) { $splitWords[$i] =~ s/^/\@\+$theIndex\@/; } # } # for # print "\n"; } $retVal = $lexItems{$tmpStr}; } # checkManyWords sub checkPairedWords { local ($curIndex) = @_; my $tmpStr, $secondPair; $tmpStr = $splitWords[$curIndex]; $tmpStr =~ tr/a-z/A-Z/; $tmpStr =~ s/\@[IB]\@([^\@]*)/\1/ig; if ($secondPair = $pairedItems{$tmpStr}) { # print "=-=-=-= $secondPair = $splitWords[$curIndex] =-= $tmpStr\n"; @tmpArr = split(/\s+/,$secondPair); $arrLen = $#tmpArr ? $#tmpArr : 1; if (&checkManyPair($arrLen+1, $curIndex)) { # print "!!!!Succesful match of First Paired words\n"; $retVal = $arrLen+1; } else { $retVal = 0; } } else { $retVal = 0; } } sub checkManyPair { local ($numOfWords, $curIndex) = @_; my $tmpIdx, $tmpStr; my $i; $tmpIdx = $curIndex + $numOfWords; $tmpStr = $splitWords[$curIndex]; for ($i = 1; $i < $numOfWords; $i++) { # concatenate the next $numOfWords into one keyword $tmpStr = $tmpStr . " " . $splitWords[$curIndex+$i]; } # for # print " in CheckManyPair $tmpStr\n"; $tmpStr =~ tr/a-z/A-Z/; $tmpStr =~ s/\@[IB]\@([^\@]*)/\1/ig; # print " after CheckManyPair $tmpStr\n"; chop($tmpStr) if $tmpStr =~ /[\,\.\:]$/; chop($tmpStr) if $tmpStr =~ /\)$/; if ($tmpStr =~ /^\(/) { $tmpStr = substr ($tmpStr, 1); } if ($tmpStr =~ /^(.*)\'s$/i) { $tmpStr = $1; } $theIndex = $pairLexKey{$tmpStr}; # print " after CheckManyPair the Index $theIndex,$tmpStr\n"; # print " Value of index (in pair) $pairLexKey{$tmpStr}\n" if $theIndex; if ($theIndex && (($theDir ne "Lexicon") || ($theIndex != $fileIdx))) { # for ($i = 0; $i < $numOfWords; $i++) { # $splitWords[$curIndex+$i] =~ s/^/\@$theIndex\@/; # This was the old method # $splitWords[$curIndex] =~ s/^/\<$theIndex\>/; # $splitWords[$curIndex + $numOfWords-1] =~ s/$/\<\/$theIndex\>/; for ($i = $curIndex; $i < $curIndex+$numOfWords ; $i++) { $splitWords[$i] =~ s/^/\@\+$theIndex\@/; } # print "Added tag in checkManyPairs - $splitWords[$curIndex]\n"; # print "$splitWords[$curIndex+$i] "; # } # for # print "\n"; } $retVal = $theIndex; } # checkManyPair sub checkOne { local ($curIndex) = @_; my $tmpStr; $tmpStr = $splitWords[$curIndex]; $tmpStr =~ tr/a-z/A-Z/; $tmpStr =~ s/\@[IB]\@([^\@]*)/\1/ig; chop($tmpStr) if $tmpStr =~ /[\,\.\:]$/; chop($tmpStr) if $tmpStr =~ /\)$/; if ($tmpStr =~ /^$/) { $tmpStr = substr ($tmpStr, 1); } if ($tmpStr =~ /^(.*)\'S$/) { $tmpStr = $1; } $theIndex = $lexItems{$tmpStr}; # print " Value of index $lexItems{$tmpStr}\n" if $theIndex; # if ($theIndex && $firstPaired{$fileIdx} =~ /\b$tmpStr\b/) { # print " match first pair in file $fileIdx, entry $tmpStr - $firstPaired{$fileIdx}\n"; # } if ($theIndex && (($theDir ne "Lexicon") || ($theIndex != $fileIdx)) && ($firstPaired{$fileIdx} !~ /\b$tmpStr\b/)) { # $splitWords[$curIndex] =~ s/^/\@$theIndex\@/; # This was the old method # $splitWords[$curIndex] =~ s/^(.*)$/\<$theIndex\>\1\<\/$theIndex\>/; $splitWords[$curIndex] =~ s/^/\@\+$theIndex\@/; # print "Added tag in checkOne - $splitWords[$curIndex]\n"; # print "$splitWords[$curIndex]\n "; $retVal = 1; } else { $retVal = 0; } } # checkOne sub checkPluralEtc { local ($curIndex) = @_; my $tmpStr; $tmpStr = $splitWords[$curIndex]; $tmpStr =~ tr/a-z/A-Z/; $tmpStr =~ s/\@[IB]\@([^\@]*)/\1/ig; chop($tmpStr) if $tmpStr =~ /[\,\.\:]$/; chop($tmpStr) if $tmpStr =~ /$$/; if ($tmpStr =~ /^\(/) { $tmpStr = substr ($tmpStr, 1); } if ($tmpStr =~ /^(.*)\'S$/) { $tmpStr = $1; } elsif ($tmpStr =~ /^(.*)IES$/) { $tmpStr = $1 . "Y"; } elsif ($tmpStr =~ /^(.*)ES$/) { $tmpStr = $1; } elsif ($tmpStr =~ /S$/) { # regular plural chop($tmpStr); } else { return 0; } $theIndex = $lexItems{$tmpStr}; # print " Value of index(Plural) $lexItems{$tmpStr}\n" if $theIndex; if ($theIndex && (($theDir ne "Lexicon") || ($theIndex != $fileIdx)) && ($firstPaired{$fileIdx} !~ /\b$tmpStr\b/)) { # This was the old method # $splitWords[$curIndex] =~ s/^(.*)$/\<$theIndex\>\1\<\/$theIndex\>/; $splitWords[$curIndex] =~ s/^/\@\+$theIndex\@/; # print "Added tag in checkPlural Etc. - $splitWords[$curIndex]\n"; # $splitWords[$curIndex] =~ s/^/\@$theIndex\@/; # print "(Plural) $splitWords[$curIndex]\n "; $retVal = 1; } else { $retVal = 0; } } # checkPluralEtc sub printKeyArr { local($arRef) = @_; foreach $key (keys (%$arRef)) { print " The Key $key and its val $$arRef{$key}\n"; } }